In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1045]:
# Dropdown for picking which stock's dataset to analyse.
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    # Ignore every event except an actual change of the selected value.
    if change['type'] != 'change' or change['name'] != 'value':
        return
    print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected ABUS
In [1046]:
# Load the pre-processed CSV for the ticker chosen in the dropdown above.
# The file name is derived directly from the ticker symbol, replacing the
# previous 15-way if chain (same behavior: df is only assigned when a real
# ticker — not 'SELECT' — is chosen).
VALID_TICKERS = {
    'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
    'FB', 'GME', 'MCD', 'PFE', 'PLUG',
    'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU',
}
if w.value in VALID_TICKERS:
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1047]:
# Never truncate cell contents when displaying DataFrames.
pd.options.display.max_colwidth = None
In [1048]:
# Parse the Date strings into proper pandas datetimes.
df['Date'] = pd.to_datetime(df['Date'])
In [1049]:
# Drop the leftover CSV index column written by the preprocessing step.
df = df.drop(columns=['Unnamed: 0'])
In [1050]:
# Preview the first five rows of the loaded dataset.
df.head(5)
Out[1050]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2015-08-27 7.10 7.39 6.90 7.27 7.27 708400 3.857143 1.336642 0.052163 0.706734 7.643928 6.730357 7.187143 NaN 9.721232 0.49 82.090605 NaN NaN NaN -1.680000 NaN -0.187709 25.702256 NaN NaN 56.792473 40.750362 -1.329645e+06 -56502.306404 -2915300.0 0.0 268472.000000 0.0 0.0 0.0 0.0 0.0 268472.000000 268472.000000 268472.000000 268472.000000 268472.000000 1 13 14 14 14 14 14 0
1 2015-08-28 7.15 7.48 7.15 7.36 7.36 514900 1.237966 1.463052 0.025535 0.652914 7.466735 6.827551 7.147143 NaN 8.871120 0.33 72.305598 NaN NaN NaN -1.130000 NaN -0.133098 27.743861 NaN NaN 74.425806 56.487922 -1.189217e+06 67211.743856 -2400400.0 0.0 19835.214942 0.0 0.0 0.0 0.0 0.0 19835.214942 19835.214942 19835.214942 19835.214942 19835.214942 0 4 4 4 4 4 4 0
2 2015-08-31 7.34 7.57 7.04 7.11 7.11 543400 -3.396739 1.639846 0.022620 0.635355 7.423659 6.822055 7.122857 -1.353269 8.936080 0.53 74.276650 NaN NaN NaN -1.660000 -15.246783 -0.189282 25.636512 NaN NaN 77.866669 69.694982 -1.589077e+06 -15516.329354 -2943800.0 0.0 4543.809961 0.0 0.0 0.0 0.0 0.0 4543.809961 4543.809961 4543.809961 4543.809961 4543.809961 0 3 3 3 3 3 3 0
3 2015-09-01 7.02 7.08 6.67 6.76 6.76 511500 -4.922643 1.239065 0.036212 0.607447 7.439162 6.677981 7.058571 -1.361282 8.985908 0.44 79.892536 NaN NaN NaN -1.469999 -15.575995 -0.178615 23.002285 NaN NaN 59.988412 70.760296 -1.876016e+06 -139247.689391 -3455300.0 0.0 1481.702490 0.0 0.0 0.0 0.0 0.0 1481.702490 1481.702490 1481.702490 1481.702490 1481.702490 0 2 2 2 2 2 2 0
4 2015-09-02 6.92 7.18 6.85 6.99 6.99 867100 3.402360 0.830664 0.034449 0.580669 7.438352 6.695934 7.067143 -1.383462 8.307141 0.42 68.306460 NaN NaN NaN -0.650000 -16.061621 -0.085079 28.221804 NaN NaN 41.706915 59.853999 -2.007395e+06 -219008.693824 -2588200.0 0.0 180950.239844 0.0 0.0 0.0 0.0 0.0 180950.239844 180950.239844 180950.239844 180950.239844 180950.239844 0 30 30 30 30 30 30 0
In [1051]:
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1517 entries, 0 to 1516
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1517 non-null   datetime64[ns]
 1   Open                       1517 non-null   float64       
 2   High                       1517 non-null   float64       
 3   Low                        1517 non-null   float64       
 4   Close                      1517 non-null   float64       
 5   Adj Close                  1517 non-null   float64       
 6   Volume                     1517 non-null   int64         
 7   Return                     1517 non-null   float64       
 8   Beta                       1517 non-null   float64       
 9   Variance                   1517 non-null   float64       
 10  AvgTrueRange               1517 non-null   float64       
 11  Upperband                  1517 non-null   float64       
 12  Lowerband                  1517 non-null   float64       
 13  Middleband                 1517 non-null   float64       
 14  APO                        1515 non-null   float64       
 15  NATR                       1517 non-null   float64       
 16  TRANGE                     1517 non-null   float64       
 17  DMI                        1517 non-null   float64       
 18  MACD                       1507 non-null   float64       
 19  MACDSIGNAL                 1507 non-null   float64       
 20  MACDHIST                   1507 non-null   float64       
 21  MOM                        1517 non-null   float64       
 22  PPO                        1515 non-null   float64       
 23  ROCP                       1517 non-null   float64       
 24  RSI                        1517 non-null   float64       
 25  TRIX                       1456 non-null   float64       
 26  ULTOSC                     1512 non-null   float64       
 27  SLOWK                      1517 non-null   float64       
 28  SLOWD                      1517 non-null   float64       
 29  AD                         1517 non-null   float64       
 30  ADOSC                      1517 non-null   float64       
 31  OBV                        1517 non-null   float64       
 32  Upward_momentum_created    1517 non-null   float64       
 33  Downward_momentum_created  1517 non-null   float64       
 34  B5_O_Um                    1517 non-null   float64       
 35  B5_C_Um                    1517 non-null   float64       
 36  B5_E_Um                    1517 non-null   float64       
 37  B5_A_Um                    1517 non-null   float64       
 38  B5_N_Um                    1517 non-null   float64       
 39  B5_O_Dm                    1517 non-null   float64       
 40  B5_C_Dm                    1517 non-null   float64       
 41  B5_E_Dm                    1517 non-null   float64       
 42  B5_A_Dm                    1517 non-null   float64       
 43  B5_N_Dm                    1517 non-null   float64       
 44  Verified_status_True       1517 non-null   int64         
 45  Verified_status_False      1517 non-null   int64         
 46  O                          1517 non-null   int64         
 47  C                          1517 non-null   int64         
 48  E                          1517 non-null   int64         
 49  A                          1517 non-null   int64         
 50  N                          1517 non-null   int64         
 51  Real_or_Fake_tweet         1517 non-null   int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 616.4 KB
In [1052]:
# (rows, columns) of the raw dataset.
df.shape
Out[1052]:
(1517, 52)
In [1053]:
# Slightly smaller seaborn fonts for subsequent plots.
sns.set(font_scale=0.8)
In [1054]:
# Seaborn "talk" context; switch to "poster" for even larger fonts.
sns.set_context("talk", font_scale=1.3)

# Line plot of the selected stock's closing price over the full sample.
# (The earlier comment mentioned BTC-USD — leftover from a copied template;
# this plots the stock chosen in the dropdown, not Bitcoin.)
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')
In [1055]:
# Daily percentage returns via pct_change(); the first row is NaN (no prior
# close). NOTE: the previous `.dropna()` in this chain was a no-op — column
# assignment realigns on the index, so the dropped first row came back as
# NaN anyway. Removed for clarity; behavior is unchanged.
df['returns'] = 100 * df.Close.pct_change()
In [1056]:
# Log returns: ln(P_t / P_{t-1}); the first row is NaN (no previous close).
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [1057]:
# Inspect the frame with the two new return columns appended.
df.head()
Out[1057]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2015-08-27 7.10 7.39 6.90 7.27 7.27 708400 3.857143 1.336642 0.052163 0.706734 7.643928 6.730357 7.187143 NaN 9.721232 0.49 82.090605 NaN NaN NaN -1.680000 NaN -0.187709 25.702256 NaN NaN 56.792473 40.750362 -1.329645e+06 -56502.306404 -2915300.0 0.0 268472.000000 0.0 0.0 0.0 0.0 0.0 268472.000000 268472.000000 268472.000000 268472.000000 268472.000000 1 13 14 14 14 14 14 0 NaN NaN
1 2015-08-28 7.15 7.48 7.15 7.36 7.36 514900 1.237966 1.463052 0.025535 0.652914 7.466735 6.827551 7.147143 NaN 8.871120 0.33 72.305598 NaN NaN NaN -1.130000 NaN -0.133098 27.743861 NaN NaN 74.425806 56.487922 -1.189217e+06 67211.743856 -2400400.0 0.0 19835.214942 0.0 0.0 0.0 0.0 0.0 19835.214942 19835.214942 19835.214942 19835.214942 19835.214942 0 4 4 4 4 4 4 0 1.237966 0.012304
2 2015-08-31 7.34 7.57 7.04 7.11 7.11 543400 -3.396739 1.639846 0.022620 0.635355 7.423659 6.822055 7.122857 -1.353269 8.936080 0.53 74.276650 NaN NaN NaN -1.660000 -15.246783 -0.189282 25.636512 NaN NaN 77.866669 69.694982 -1.589077e+06 -15516.329354 -2943800.0 0.0 4543.809961 0.0 0.0 0.0 0.0 0.0 4543.809961 4543.809961 4543.809961 4543.809961 4543.809961 0 3 3 3 3 3 3 0 -3.396739 -0.034558
3 2015-09-01 7.02 7.08 6.67 6.76 6.76 511500 -4.922643 1.239065 0.036212 0.607447 7.439162 6.677981 7.058571 -1.361282 8.985908 0.44 79.892536 NaN NaN NaN -1.469999 -15.575995 -0.178615 23.002285 NaN NaN 59.988412 70.760296 -1.876016e+06 -139247.689391 -3455300.0 0.0 1481.702490 0.0 0.0 0.0 0.0 0.0 1481.702490 1481.702490 1481.702490 1481.702490 1481.702490 0 2 2 2 2 2 2 0 -4.922643 -0.050479
4 2015-09-02 6.92 7.18 6.85 6.99 6.99 867100 3.402360 0.830664 0.034449 0.580669 7.438352 6.695934 7.067143 -1.383462 8.307141 0.42 68.306460 NaN NaN NaN -0.650000 -16.061621 -0.085079 28.221804 NaN NaN 41.706915 59.853999 -2.007395e+06 -219008.693824 -2588200.0 0.0 180950.239844 0.0 0.0 0.0 0.0 0.0 180950.239844 180950.239844 180950.239844 180950.239844 180950.239844 0 30 30 30 30 30 30 0 3.402360 0.033458
In [1058]:
# The first row has NaN returns/log-returns (no previous close to compare
# against), so remove it before the distribution analysis below.
df = df.dropna()
In [1059]:
# Distribution check: time series (left) vs histogram with a fitted normal
# curve (right), for raw percentage returns (top) and log returns (bottom).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it works on
# the pinned Colab version, but prefer histplot/displot when upgrading.
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    # Histogram of returns with a fitted normal density overlaid.
    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1060]:
# Realized volatility from daily log returns (typo fix: "SAILY" -> "DAILY").
def realized_volatility_daily(series_log_return):
    """
    Realized volatility over a window of daily log returns.

    Computed as sqrt(sum(r_i**2) / (n - 1)): the square root of the sum of
    squared log returns scaled by (n - 1) — a sample-style estimate that
    assumes a zero mean return.  (The previous docstring omitted the
    (n - 1) scaling.)

    Parameters
    ----------
    series_log_return : array-like
        Window of daily log returns. Must contain at least 2 observations;
        n == 1 would divide by zero.

    Returns
    -------
    float
        The realized volatility of the window.
    """
    n = len(series_log_return)
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1061]:
# Window lengths (in trading days) to compare.
intervals = [7, 30, 60, 180, 365]

# Rolling realized volatility for each window length, keyed by window size.
vols_df = {
    window: df.log_returns.rolling(window=window)
                          .apply(realized_volatility_daily).values
    for window in intervals
}

# One column per interval, aligned to the original frame's index.
vols_df = pd.DataFrame(vols_df, columns=intervals, index=df.index)
In [1062]:
# Switch to the fivethirtyeight style for the remaining figures.
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for window in intervals:
    # De-emphasize the noisy 7-day series; draw the others at full weight.
    emphasized = window != 7
    ax.plot(vols_df[window],
            label=f'{window}-Day Interval Realized Volatility',
            alpha=1.0 if emphasized else 0.5,
            lw=2 if emphasized else 1)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1063]:
# Target setup: model realized volatility 7 days ahead from a 30-day window.
INTERVAL_WINDOW = 30
n_future = 7

# Backward-looking feature: realized volatility over the trailing 30 days.
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# Forward-looking target: shifting log returns back by 7 days makes each
# row's value the 30-day realized volatility ending 7 days in the future.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1064]:
# Summary statistics for all numeric columns.
df.describe()
Out[1064]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1.456000e+03 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1.456000e+03 1.456000e+03 1.456000e+03 1456.0 1.456000e+03 1456.0 1456.0 1456.0 1456.0 1456.0 1.456000e+03 1.456000e+03 1.456000e+03 1.456000e+03 1.456000e+03 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.0 1456.000000 1456.000000 1427.000000 1420.000000
mean 3.852823 3.988063 3.706085 3.839966 3.839966 1.493650e+06 0.141592 0.503830 0.069330 0.287905 4.194042 3.490213 3.842127 -0.005112 7.892299 0.289842 35.639904 -0.006822 -0.008087 0.001265 -0.003970 -0.048906 0.017612 49.597316 -0.030799 48.383890 46.635105 46.710055 -5.412507e+07 -1.093755e+06 -2.185741e+07 0.0 1.911769e+05 0.0 0.0 0.0 0.0 0.0 1.911769e+05 1.911769e+05 1.911769e+05 1.911769e+05 1.911769e+05 0.133242 24.210852 24.344093 24.344093 24.344093 24.344093 24.344093 0.0 0.177999 -0.000173 0.054305 0.054420
std 1.844588 1.910945 1.774438 1.842730 1.842730 1.092444e+07 6.603548 0.501089 0.298857 0.171434 2.026469 1.690834 1.824600 0.395014 3.849392 0.292007 23.545583 0.273726 0.256286 0.085100 0.772013 9.424317 0.203692 13.325952 0.591980 10.448069 23.723869 22.048123 1.007101e+08 7.188866e+06 4.847996e+07 0.0 9.010427e+05 0.0 0.0 0.0 0.0 0.0 9.010427e+05 9.010427e+05 9.010427e+05 9.010427e+05 9.010427e+05 1.334231 93.159945 94.344869 94.344869 94.344869 94.344869 94.344869 0.0 6.623271 0.061497 0.030808 0.030841
min 0.870000 0.890000 0.830000 0.870000 0.870000 1.620000e+04 -46.460175 -2.059419 0.000000 0.065458 0.910470 0.312165 0.891429 -2.698782 3.307695 0.020000 0.056338 -1.440958 -1.279442 -0.630821 -5.590000 -37.683420 -0.603960 16.787461 -1.326921 19.913514 1.981568 3.113497 -4.869689e+08 -1.007706e+08 -1.277416e+08 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 -46.460175 -0.624744 0.011831 0.011831
25% 2.800000 2.900000 2.700000 2.800000 2.800000 1.498500e+05 -2.627586 0.239303 0.005605 0.176772 3.075897 2.587037 2.823571 -0.173718 5.622442 0.150000 15.672025 -0.133503 -0.128886 -0.029955 -0.310000 -5.024277 -0.093073 41.401541 -0.420763 41.358719 26.826771 28.547543 -2.427174e+07 -3.648210e+05 -1.323830e+07 0.0 1.349560e+04 0.0 0.0 0.0 0.0 0.0 1.349560e+04 1.349560e+04 1.349560e+04 1.349560e+04 1.349560e+04 0.000000 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000 0.0 -2.631576 -0.026668 0.035993 0.036088
50% 3.570000 3.670000 3.430000 3.550000 3.550000 3.036000e+05 0.000000 0.489484 0.015502 0.238059 3.844928 3.276614 3.585714 -0.004391 7.061153 0.210000 33.099967 -0.007064 -0.008161 0.003051 0.000000 -0.126119 0.000000 48.980412 -0.055326 48.128664 45.881303 45.990499 -3.770521e+06 -2.138705e+04 -6.549300e+06 0.0 3.939631e+04 0.0 0.0 0.0 0.0 0.0 3.939631e+04 3.939631e+04 3.939631e+04 3.939631e+04 3.939631e+04 0.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 0.0 0.000000 0.000000 0.044847 0.045072
75% 4.390000 4.582500 4.222500 4.382500 4.382500 1.048650e+06 2.133535 0.778564 0.046110 0.328164 4.776274 4.026849 4.373929 0.162324 8.933023 0.350000 52.779046 0.098339 0.083535 0.040301 0.300000 4.812017 0.091442 58.565955 0.291668 55.333914 66.666643 64.371039 4.777705e+06 1.048516e+05 4.616325e+06 0.0 1.257554e+05 0.0 0.0 0.0 0.0 0.0 1.257554e+05 1.257554e+05 1.257554e+05 1.257554e+05 1.257554e+05 0.000000 19.000000 19.000000 19.000000 19.000000 19.000000 19.000000 0.0 2.227566 0.022031 0.061761 0.061879
max 11.850000 12.600000 11.700000 12.350000 12.350000 3.206319e+08 119.858154 5.527401 5.329967 1.376017 12.843129 10.904305 11.435714 1.919038 38.280030 4.250000 98.002505 1.309789 1.179590 0.402369 4.350000 39.395562 2.333333 91.918674 1.572467 81.231613 99.202564 97.153562 7.945218e+07 3.372961e+07 2.528742e+08 0.0 2.372284e+07 0.0 0.0 0.0 0.0 0.0 2.372284e+07 2.372284e+07 2.372284e+07 2.372284e+07 2.372284e+07 39.000000 2104.000000 2143.000000 2143.000000 2143.000000 2143.000000 2143.000000 0.0 119.858154 0.787812 0.177334 0.177334
In [1065]:
# Clearer name for the tweet-label column.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1066]:
# Impute remaining NaNs (the leading/trailing rows of the rolling volatility
# columns) with each column's median.
# NOTE(review): the medians are computed over the full series, which leaks
# future information into vol_current/vol_future rows — confirm acceptable
# before using these columns as model features/targets.
df = df.fillna(df.median())
In [1067]:
# Verify no missing values remain after imputation.
df.isna().sum()
Out[1067]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1068]:
# Re-check dtypes/columns after the feature-engineering steps above.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 61 to 1516
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1456 non-null   datetime64[ns]
 1   Open                       1456 non-null   float64       
 2   High                       1456 non-null   float64       
 3   Low                        1456 non-null   float64       
 4   Close                      1456 non-null   float64       
 5   Adj Close                  1456 non-null   float64       
 6   Volume                     1456 non-null   int64         
 7   Return                     1456 non-null   float64       
 8   Beta                       1456 non-null   float64       
 9   Variance                   1456 non-null   float64       
 10  AvgTrueRange               1456 non-null   float64       
 11  Upperband                  1456 non-null   float64       
 12  Lowerband                  1456 non-null   float64       
 13  Middleband                 1456 non-null   float64       
 14  APO                        1456 non-null   float64       
 15  NATR                       1456 non-null   float64       
 16  TRANGE                     1456 non-null   float64       
 17  DMI                        1456 non-null   float64       
 18  MACD                       1456 non-null   float64       
 19  MACDSIGNAL                 1456 non-null   float64       
 20  MACDHIST                   1456 non-null   float64       
 21  MOM                        1456 non-null   float64       
 22  PPO                        1456 non-null   float64       
 23  ROCP                       1456 non-null   float64       
 24  RSI                        1456 non-null   float64       
 25  TRIX                       1456 non-null   float64       
 26  ULTOSC                     1456 non-null   float64       
 27  SLOWK                      1456 non-null   float64       
 28  SLOWD                      1456 non-null   float64       
 29  AD                         1456 non-null   float64       
 30  ADOSC                      1456 non-null   float64       
 31  OBV                        1456 non-null   float64       
 32  Upward_momentum_created    1456 non-null   float64       
 33  Downward_momentum_created  1456 non-null   float64       
 34  B5_O_Um                    1456 non-null   float64       
 35  B5_C_Um                    1456 non-null   float64       
 36  B5_E_Um                    1456 non-null   float64       
 37  B5_A_Um                    1456 non-null   float64       
 38  B5_N_Um                    1456 non-null   float64       
 39  B5_O_Dm                    1456 non-null   float64       
 40  B5_C_Dm                    1456 non-null   float64       
 41  B5_E_Dm                    1456 non-null   float64       
 42  B5_A_Dm                    1456 non-null   float64       
 43  B5_N_Dm                    1456 non-null   float64       
 44  Verified_status_True       1456 non-null   int64         
 45  Verified_status_False      1456 non-null   int64         
 46  O                          1456 non-null   int64         
 47  C                          1456 non-null   int64         
 48  E                          1456 non-null   int64         
 49  A                          1456 non-null   int64         
 50  N                          1456 non-null   int64         
 51  Fake_news                  1456 non-null   int64         
 52  returns                    1456 non-null   float64       
 53  log_returns                1456 non-null   float64       
 54  vol_current                1456 non-null   float64       
 55  vol_future                 1456 non-null   float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 648.4 KB
In [1069]:
# (rows, columns) after adding the return/volatility columns.
df.shape
Out[1069]:
(1456, 56)
In [1070]:
# NOTE(review): this is a no-op — df.isna().sum() two cells above shows
# zero NaNs after the median imputation; safe to delete during cleanup.
df=df.dropna()
In [1071]:
# Final dtype listing for every column.
df.dtypes
Out[1071]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1072]:
# Correlation heatmap across all numeric features.
# (Removed the duplicate matplotlib/seaborn imports — both are already
# imported in the setup cell at the top of the notebook; mid-notebook
# re-imports hide dependencies and clutter the narrative.)
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[1072]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f078419b790>
In [1073]:
# Histogram of every numeric column (70 bins each); trailing ';' suppresses the axes repr.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1074]:
# Keep only the features whose correlation with AvgTrueRange is strong (|r| > 0.5).
corr_atr = df.corr()['AvgTrueRange']
strong_corr = corr_atr[corr_atr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(strong_corr), strong_corr))
There are 11 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
Upperband       0.785062
High            0.733270
Open            0.714507
Middleband      0.711005
Adj Close       0.698159
Close           0.698159
Low             0.682497
TRANGE          0.667846
Lowerband       0.593611
Variance        0.531689
Name: AvgTrueRange, dtype: float64
In [1075]:
# Strong (|r| > 0.5) correlations with the NATR volatility measure.
natr_corr = df.corr()['NATR']
strong = natr_corr[natr_corr.abs() > 0.5].sort_values(ascending=False)
print(f"There are {len(strong)} strongly correlated values with NATR :\n{strong}")
There are 3 strongly correlated values with NATR :
NATR           1.000000
vol_future     0.759750
vol_current    0.743874
Name: NATR, dtype: float64
In [1076]:
# Strong (|r| > 0.5) correlations with TRANGE (true range).
trange_corr = df.corr()['TRANGE']
strong_trange = trange_corr[trange_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(strong_trange), strong_trange))
There are 17 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.667846
B5_A_Dm                      0.626346
B5_E_Dm                      0.626346
Downward_momentum_created    0.626346
B5_O_Dm                      0.626346
B5_C_Dm                      0.626346
B5_N_Dm                      0.626346
N                            0.590618
A                            0.590618
O                            0.590618
C                            0.590618
E                            0.590618
Verified_status_False        0.590556
Volume                       0.548068
Verified_status_True         0.528831
High                         0.524424
Name: TRANGE, dtype: float64
In [1077]:
# Strong (|r| > 0.5) correlations with the Openness trait column 'O'.
openness_corr = df.corr()['O']
strong_o = openness_corr[openness_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(strong_o), strong_o))
There are 15 strongly correlated values with Openness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999979
B5_N_Dm                      0.951897
B5_A_Dm                      0.951897
B5_E_Dm                      0.951897
B5_C_Dm                      0.951897
B5_O_Dm                      0.951897
Downward_momentum_created    0.951897
Volume                       0.922543
Verified_status_True         0.889589
TRANGE                       0.590618
Name: O, dtype: float64
In [1078]:
# Strong (|r| > 0.5) correlations with the Conscientiousness trait column 'C'.
c_corr = df.corr()['C']
strong_c = c_corr[c_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(strong_c), strong_c))
There are 15 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999979
B5_N_Dm                      0.951897
B5_A_Dm                      0.951897
B5_E_Dm                      0.951897
B5_C_Dm                      0.951897
B5_O_Dm                      0.951897
Downward_momentum_created    0.951897
Volume                       0.922543
Verified_status_True         0.889589
TRANGE                       0.590618
Name: C, dtype: float64
In [1079]:
# Strong (|r| > 0.5) correlations with the Extraversion trait column 'E'.
df_corr = df.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses 'E' (Extraversion), not conscientiousness.
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999979
B5_N_Dm                      0.951897
B5_A_Dm                      0.951897
B5_E_Dm                      0.951897
B5_C_Dm                      0.951897
B5_O_Dm                      0.951897
Downward_momentum_created    0.951897
Volume                       0.922543
Verified_status_True         0.889589
TRANGE                       0.590618
Name: E, dtype: float64
In [1080]:
# Strong (|r| > 0.5) correlations with the Agreeableness trait column 'A'.
df_corr = df.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses 'A' (Agreeableness), not conscientiousness.
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999979
B5_N_Dm                      0.951897
B5_A_Dm                      0.951897
B5_E_Dm                      0.951897
B5_C_Dm                      0.951897
B5_O_Dm                      0.951897
Downward_momentum_created    0.951897
Volume                       0.922543
Verified_status_True         0.889589
TRANGE                       0.590618
Name: A, dtype: float64
In [1081]:
# Strong (|r| > 0.5) correlations with the Neuroticism trait column 'N'.
df_corr = df.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses 'N' (Neuroticism), not conscientiousness.
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999979
B5_N_Dm                      0.951897
B5_A_Dm                      0.951897
B5_E_Dm                      0.951897
B5_C_Dm                      0.951897
B5_O_Dm                      0.951897
Downward_momentum_created    0.951897
Volume                       0.922543
Verified_status_True         0.889589
TRANGE                       0.590618
Name: N, dtype: float64
In [1082]:
# List all available columns before the per-feature correlation checks below.
df.columns
Out[1082]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1083]:
# Strong (|r| > 0.5) correlations with the upward-momentum Openness signal.
um_o_corr = df.corr()['B5_O_Um']
strong_um_o = um_o_corr[um_o_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(strong_um_o), strong_um_o))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1084]:
# Strong (|r| > 0.5) correlations with the upward-momentum Conscientiousness signal.
um_c_corr = df.corr()['B5_C_Um']
strong_um_c = um_c_corr[um_c_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(strong_um_c), strong_um_c))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1085]:
# Strong (|r| > 0.5) correlations with the upward-momentum Extraversion signal.
um_e_corr = df.corr()['B5_E_Um']
strong_um_e = um_e_corr[um_e_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(strong_um_e), strong_um_e))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1086]:
# Strong (|r| > 0.5) correlations with the upward-momentum Agreeableness signal.
um_a_corr = df.corr()['B5_A_Um']
strong_um_a = um_a_corr[um_a_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(strong_um_a), strong_um_a))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1087]:
# Strong (|r| > 0.5) correlations with the upward-momentum Neuroticism signal.
um_n_corr = df.corr()['B5_N_Um']
strong_um_n = um_n_corr[um_n_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(strong_um_n), strong_um_n))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1088]:
# Strong (|r| > 0.5) correlations with the downward-momentum Openness signal.
dm_o_corr = df.corr()['B5_O_Dm']
strong_dm_o = dm_o_corr[dm_o_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(strong_dm_o), strong_dm_o))
There are 15 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: B5_O_Dm, dtype: float64
In [1089]:
# Strong (|r| > 0.5) correlations with the downward-momentum Conscientiousness signal.
dm_c_corr = df.corr()['B5_C_Dm']
strong_dm_c = dm_c_corr[dm_c_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(strong_dm_c), strong_dm_c))
There are 15 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: B5_C_Dm, dtype: float64
In [1090]:
# Strong (|r| > 0.5) correlations with the downward-momentum Extraversion signal.
dm_e_corr = df.corr()['B5_E_Dm']
strong_dm_e = dm_e_corr[dm_e_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(strong_dm_e), strong_dm_e))
There are 15 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: B5_E_Dm, dtype: float64
In [1091]:
# Strong (|r| > 0.5) correlations with the downward-momentum Agreeableness signal.
dm_a_corr = df.corr()['B5_A_Dm']
strong_dm_a = dm_a_corr[dm_a_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(strong_dm_a), strong_dm_a))
There are 15 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: B5_A_Dm, dtype: float64
In [1092]:
# Strong (|r| > 0.5) correlations with the downward-momentum Neuroticism signal.
dm_n_corr = df.corr()['B5_N_Dm']
strong_dm_n = dm_n_corr[dm_n_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(strong_dm_n), strong_dm_n))
There are 15 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: B5_N_Dm, dtype: float64
In [1093]:
# Strong (|r| > 0.5) correlations with the Fake_news flag.
df_corr = df.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed label: the analysed column is 'Fake_news', not 'Real_or_Fake_tweet'.
print("There are {} strongly correlated values with Fake_news:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1094]:
# Strong (|r| > 0.5) correlations with the downward-momentum indicator.
down_corr = df.corr()['Downward_momentum_created']
strong_down = down_corr[down_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(strong_down), strong_down))
There are 15 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.951897
A                            0.951897
E                            0.951897
C                            0.951897
O                            0.951897
Volume                       0.950682
Verified_status_False        0.950664
Verified_status_True         0.931489
TRANGE                       0.626346
Name: Downward_momentum_created, dtype: float64
In [1095]:
# Strong (|r| > 0.5) correlations with the upward-momentum indicator.
up_corr = df.corr()['Upward_momentum_created']
strong_up = up_corr[up_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(strong_up), strong_up))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1096]:
# Strong (|r| > 0.5) correlations with the verified-account tweet count.
ver_true_corr = df.corr()['Verified_status_True']
strong_ver_true = ver_true_corr[ver_true_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(strong_ver_true), strong_ver_true))
There are 15 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Volume                       0.952901
B5_N_Dm                      0.931489
B5_A_Dm                      0.931489
B5_E_Dm                      0.931489
B5_C_Dm                      0.931489
B5_O_Dm                      0.931489
Downward_momentum_created    0.931489
N                            0.889589
A                            0.889589
E                            0.889589
C                            0.889589
O                            0.889589
Verified_status_False        0.886582
TRANGE                       0.528831
Name: Verified_status_True, dtype: float64
In [1097]:
# Strong (|r| > 0.5) correlations with the unverified-account tweet count.
ver_false_corr = df.corr()['Verified_status_False']
strong_ver_false = ver_false_corr[ver_false_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(strong_ver_false), strong_ver_false))
There are 15 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999979
A                            0.999979
E                            0.999979
C                            0.999979
O                            0.999979
B5_N_Dm                      0.950664
B5_A_Dm                      0.950664
B5_E_Dm                      0.950664
B5_C_Dm                      0.950664
B5_O_Dm                      0.950664
Downward_momentum_created    0.950664
Volume                       0.920629
Verified_status_True         0.886582
TRANGE                       0.590556
Name: Verified_status_False, dtype: float64
In [1098]:
# Shrink seaborn fonts for the dense pairplot grid that follows.
sns.set(font_scale=0.8)
In [1099]:
# Plot every feature against the NATR target, five features per pairplot.
for start in range(0, len(df.columns), 5):
    feature_slice = df.columns[start:start + 5]
    sns.pairplot(data=df, x_vars=feature_slice, y_vars=['NATR'])
In [1100]:
# Confirm column dtypes (Date is datetime64; indicators are float64 per the output below).
df.dtypes
Out[1100]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1101]:
# Count missing values per column (all zero in the recorded run).
df.isnull().sum()
Out[1101]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1102]:
# Replace any remaining NaNs with 0.
# Assignment form instead of inplace=True: inplace has no performance benefit
# and hides the state change from readers on re-runs.
df = df.fillna(0)
In [1103]:
# Drop NaN rows (defensive no-op: the fillna(0) in the previous cell already
# removed every NaN). Assignment form replaces the inplace=True anti-pattern.
df = df.dropna()
In [1104]:
# Re-apply the smaller font scale before the thresholded correlation heatmap.
sns.set(font_scale=0.8)
In [1105]:
# Heatmap of strong correlations with the 'Close' column excluded.
# NOTE(review): the thresholds are asymmetric (>= 0.5 positive vs <= -0.4
# negative) — confirm this is intentional.
corr_matrix = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

strong_mask = (corr_matrix >= 0.5) | (corr_matrix <= -0.4)
sns.heatmap(corr_matrix[strong_mask],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1106]:
# Summary statistics for every numeric column.
df.describe()
Out[1106]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1.456000e+03 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1.456000e+03 1.456000e+03 1.456000e+03 1456.0 1.456000e+03 1456.0 1456.0 1456.0 1456.0 1456.0 1.456000e+03 1.456000e+03 1.456000e+03 1.456000e+03 1.456000e+03 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.000000 1456.0 1456.000000 1456.000000 1456.000000 1456.000000
mean 3.852823 3.988063 3.706085 3.839966 3.839966 1.493650e+06 0.141592 0.503830 0.069330 0.287905 4.194042 3.490213 3.842127 -0.005112 7.892299 0.289842 35.639904 -0.006822 -0.008087 0.001265 -0.003970 -0.048906 0.017612 49.597316 -0.030799 48.383890 46.635105 46.710055 -5.412507e+07 -1.093755e+06 -2.185741e+07 0.0 1.911769e+05 0.0 0.0 0.0 0.0 0.0 1.911769e+05 1.911769e+05 1.911769e+05 1.911769e+05 1.911769e+05 0.133242 24.210852 24.344093 24.344093 24.344093 24.344093 24.344093 0.0 0.177999 -0.000173 0.054117 0.054189
std 1.844588 1.910945 1.774438 1.842730 1.842730 1.092444e+07 6.603548 0.501089 0.298857 0.171434 2.026469 1.690834 1.824600 0.395014 3.849392 0.292007 23.545583 0.273726 0.256286 0.085100 0.772013 9.424317 0.203692 13.325952 0.591980 10.448069 23.723869 22.048123 1.007101e+08 7.188866e+06 4.847996e+07 0.0 9.010427e+05 0.0 0.0 0.0 0.0 0.0 9.010427e+05 9.010427e+05 9.010427e+05 9.010427e+05 9.010427e+05 1.334231 93.159945 94.344869 94.344869 94.344869 94.344869 94.344869 0.0 6.623271 0.061497 0.030528 0.030491
min 0.870000 0.890000 0.830000 0.870000 0.870000 1.620000e+04 -46.460175 -2.059419 0.000000 0.065458 0.910470 0.312165 0.891429 -2.698782 3.307695 0.020000 0.056338 -1.440958 -1.279442 -0.630821 -5.590000 -37.683420 -0.603960 16.787461 -1.326921 19.913514 1.981568 3.113497 -4.869689e+08 -1.007706e+08 -1.277416e+08 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 -46.460175 -0.624744 0.011831 0.011831
25% 2.800000 2.900000 2.700000 2.800000 2.800000 1.498500e+05 -2.627586 0.239303 0.005605 0.176772 3.075897 2.587037 2.823571 -0.173718 5.622442 0.150000 15.672025 -0.133503 -0.128886 -0.029955 -0.310000 -5.024277 -0.093073 41.401541 -0.420763 41.358719 26.826771 28.547543 -2.427174e+07 -3.648210e+05 -1.323830e+07 0.0 1.349560e+04 0.0 0.0 0.0 0.0 0.0 1.349560e+04 1.349560e+04 1.349560e+04 1.349560e+04 1.349560e+04 0.000000 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000 0.0 -2.631576 -0.026668 0.036120 0.036359
50% 3.570000 3.670000 3.430000 3.550000 3.550000 3.036000e+05 0.000000 0.489484 0.015502 0.238059 3.844928 3.276614 3.585714 -0.004391 7.061153 0.210000 33.099967 -0.007064 -0.008161 0.003051 0.000000 -0.126119 0.000000 48.980412 -0.055326 48.128664 45.881303 45.990499 -3.770521e+06 -2.138705e+04 -6.549300e+06 0.0 3.939631e+04 0.0 0.0 0.0 0.0 0.0 3.939631e+04 3.939631e+04 3.939631e+04 3.939631e+04 3.939631e+04 0.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 0.0 0.000000 0.000000 0.044847 0.045072
75% 4.390000 4.582500 4.222500 4.382500 4.382500 1.048650e+06 2.133535 0.778564 0.046110 0.328164 4.776274 4.026849 4.373929 0.162324 8.933023 0.350000 52.779046 0.098339 0.083535 0.040301 0.300000 4.812017 0.091442 58.565955 0.291668 55.333914 66.666643 64.371039 4.777705e+06 1.048516e+05 4.616325e+06 0.0 1.257554e+05 0.0 0.0 0.0 0.0 0.0 1.257554e+05 1.257554e+05 1.257554e+05 1.257554e+05 1.257554e+05 0.000000 19.000000 19.000000 19.000000 19.000000 19.000000 19.000000 0.0 2.227566 0.022031 0.061022 0.061022
max 11.850000 12.600000 11.700000 12.350000 12.350000 3.206319e+08 119.858154 5.527401 5.329967 1.376017 12.843129 10.904305 11.435714 1.919038 38.280030 4.250000 98.002505 1.309789 1.179590 0.402369 4.350000 39.395562 2.333333 91.918674 1.572467 81.231613 99.202564 97.153562 7.945218e+07 3.372961e+07 2.528742e+08 0.0 2.372284e+07 0.0 0.0 0.0 0.0 0.0 2.372284e+07 2.372284e+07 2.372284e+07 2.372284e+07 2.372284e+07 39.000000 2104.000000 2143.000000 2143.000000 2143.000000 2143.000000 2143.000000 0.0 119.858154 0.787812 0.177334 0.177334
In [1107]:
# DROPPING ALL NaN VALUES
# (defensive: the earlier fillna(0) should already have removed every NaN;
# assignment form replaces the inplace=True anti-pattern)
df = df.dropna()
In [1108]:
# n_zoom: number of trailing days shown in the zoomed lower panel.
# NOTE(review): `n_future` and `INTERVAL_WINDOW` are not defined in this chunk —
# they presumably come from earlier volatility-computation cells; this cell
# fails on a fresh run if those cells are skipped. TODO confirm.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history of current vs. future realized volatility.
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: same series restricted to the last n_zoom days.
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1109]:
# Histogram of current daily volatility with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot/displot
# are the replacements) — it still works on the pinned Colab version used here.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [1111]:
# Stock picker for the weekly-granularity experiment (same ticker set as before).
TICKERS = ['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
           'FB', 'GME', 'MCD','PFE', 'PLUG', 
           'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU']
w = widgets.Dropdown(options=TICKERS, value='SELECT', description ='Stock name:')

def on_change(change):
    """Echo the newly selected ticker; ignore non-value widget events."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected ABUS
In [1112]:
# Load the preprocessed CSV for the ticker selected in the dropdown.
# The 15 copy-pasted if-branches differed only in the ticker embedded in the
# path, so they collapse into one parameterized read. The 'SELECT'
# placeholder intentionally loads nothing (matching the original behavior,
# where no branch fired for 'SELECT').
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv',
                     parse_dates=['Date'], index_col=['Date'])
In [1113]:
# Inspect the loaded feature set.
df.columns
Out[1113]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1114]:
# Rows x columns of the selected ticker's frame.
df.shape
Out[1114]:
(1517, 52)
In [1115]:
# Per-column missing-value counts (the NaNs are concentrated in technical
# indicator columns such as MACD, TRIX and ULTOSC, per the output below).
df.isnull().sum()
Out[1115]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           2
NATR                          0
TRANGE                        0
DMI                           0
MACD                         10
MACDSIGNAL                   10
MACDHIST                     10
MOM                           0
PPO                           2
ROCP                          0
RSI                           0
TRIX                         61
ULTOSC                        5
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1116]:
# Impute remaining NaNs with each column's median, drop the CSV's stray
# positional index column, and shorten the target column's name.
# Reassignment (instead of `del` / inplace=True) keeps every step explicit
# and the cell idempotent on re-run.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1117]:
# Downsample the daily frame to weekly means.
df_weekly = df.resample('W').mean()
In [1118]:
# Size of the weekly-aggregated frame.
df_weekly.shape
Out[1118]:
(330, 51)
In [1119]:
# Full correlation heatmap across all weekly-aggregated features.
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[1119]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0786d08d90>
In [1120]:
# Shrink the seaborn font scale for the dense histogram grid below.
sns.set(font_scale=0.8)
In [1121]:
# Distribution of every weekly feature; trailing ';' suppresses the axes repr.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1122]:
# Screen for features with |r| > 0.5 against AvgTrueRange.
col_corr = df_weekly.corr()['AvgTrueRange']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 11 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
Upperband       0.804180
TRANGE          0.785742
High            0.756013
Middleband      0.743197
Open            0.741985
Adj Close       0.731597
Close           0.731597
Low             0.718202
Lowerband       0.650420
Variance        0.571438
Name: AvgTrueRange, dtype: float64
In [1123]:
# Screen for features with |r| > 0.5 against NATR.
col_corr = df_weekly.corr()['NATR']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 1 strongly correlated values with NATR :
NATR    1.0
Name: NATR, dtype: float64
In [1124]:
# Screen for features with |r| > 0.5 against TRANGE.
col_corr = df_weekly.corr()['TRANGE']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 23 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.785742
Upperband                    0.649997
High                         0.634473
Variance                     0.617296
Open                         0.600420
Close                        0.594415
Adj Close                    0.594415
B5_O_Dm                      0.586204
Downward_momentum_created    0.586204
B5_C_Dm                      0.586204
B5_E_Dm                      0.586204
B5_A_Dm                      0.586204
B5_N_Dm                      0.586204
Middleband                   0.572488
Low                          0.563559
A                            0.530420
O                            0.530420
C                            0.530420
E                            0.530420
N                            0.530420
Verified_status_False        0.530406
Volume                       0.503494
Name: TRANGE, dtype: float64
In [1125]:
# Screen for features with |r| > 0.5 against 'O' (openness).
col_corr = df_weekly.corr()['O']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Openness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999988
B5_N_Dm                      0.965420
B5_A_Dm                      0.965420
B5_E_Dm                      0.965420
B5_C_Dm                      0.965420
B5_O_Dm                      0.965420
Downward_momentum_created    0.965420
Volume                       0.954595
Verified_status_True         0.914538
TRANGE                       0.530420
ADOSC                       -0.572185
Name: O, dtype: float64
In [1126]:
# Screen for features with |r| > 0.5 against 'C' (conscientiousness).
col_corr = df_weekly.corr()['C']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999988
B5_N_Dm                      0.965420
B5_A_Dm                      0.965420
B5_E_Dm                      0.965420
B5_C_Dm                      0.965420
B5_O_Dm                      0.965420
Downward_momentum_created    0.965420
Volume                       0.954595
Verified_status_True         0.914538
TRANGE                       0.530420
ADOSC                       -0.572185
Name: C, dtype: float64
In [1127]:
# Screen for features with |r| > 0.5 against 'E' (extraversion).
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# FIX: label said "conscientiousness" (copy-paste from the 'C' cell); 'E' is extraversion.
print("There are {} strongly correlated values with extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999988
B5_N_Dm                      0.965420
B5_A_Dm                      0.965420
B5_E_Dm                      0.965420
B5_C_Dm                      0.965420
B5_O_Dm                      0.965420
Downward_momentum_created    0.965420
Volume                       0.954595
Verified_status_True         0.914538
TRANGE                       0.530420
ADOSC                       -0.572185
Name: E, dtype: float64
In [1128]:
# Screen for features with |r| > 0.5 against 'A' (agreeableness).
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# FIX: label said "conscientiousness" (copy-paste from the 'C' cell); 'A' is agreeableness.
print("There are {} strongly correlated values with agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999988
B5_N_Dm                      0.965420
B5_A_Dm                      0.965420
B5_E_Dm                      0.965420
B5_C_Dm                      0.965420
B5_O_Dm                      0.965420
Downward_momentum_created    0.965420
Volume                       0.954595
Verified_status_True         0.914538
TRANGE                       0.530420
ADOSC                       -0.572185
Name: A, dtype: float64
In [1129]:
# Screen for features with |r| > 0.5 against 'N' (neuroticism).
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# FIX: label said "conscientiousness" (copy-paste from the 'C' cell); 'N' is neuroticism.
print("There are {} strongly correlated values with neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999988
B5_N_Dm                      0.965420
B5_A_Dm                      0.965420
B5_E_Dm                      0.965420
B5_C_Dm                      0.965420
B5_O_Dm                      0.965420
Downward_momentum_created    0.965420
Volume                       0.954595
Verified_status_True         0.914538
TRANGE                       0.530420
ADOSC                       -0.572185
Name: N, dtype: float64
In [1130]:
# Screen for features with |r| > 0.5 against B5_O_Um.
col_corr = df_weekly.corr()['B5_O_Um']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1131]:
# Screen for features with |r| > 0.5 against B5_C_Um.
col_corr = df_weekly.corr()['B5_C_Um']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1132]:
# Screen for features with |r| > 0.5 against B5_E_Um.
col_corr = df_weekly.corr()['B5_E_Um']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1133]:
# Screen for features with |r| > 0.5 against B5_A_Um.
col_corr = df_weekly.corr()['B5_A_Um']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1134]:
# Screen for features with |r| > 0.5 against B5_N_Um.
col_corr = df_weekly.corr()['B5_N_Um']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward Momentum Correlation

In [1135]:
# Screen for features with |r| > 0.5 against B5_O_Dm.
col_corr = df_weekly.corr()['B5_O_Dm']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: B5_O_Dm, dtype: float64
In [1136]:
# Screen for features with |r| > 0.5 against B5_C_Dm.
col_corr = df_weekly.corr()['B5_C_Dm']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: B5_C_Dm, dtype: float64
In [1137]:
# Screen for features with |r| > 0.5 against B5_E_Dm.
col_corr = df_weekly.corr()['B5_E_Dm']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: B5_E_Dm, dtype: float64
In [1138]:
# Screen for features with |r| > 0.5 against B5_A_Dm.
col_corr = df_weekly.corr()['B5_A_Dm']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: B5_A_Dm, dtype: float64
In [1139]:
# Screen for features with |r| > 0.5 against B5_N_Dm.
col_corr = df_weekly.corr()['B5_N_Dm']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: B5_N_Dm, dtype: float64
In [1140]:
# Screen for features with |r| > 0.5 against the target column 'Fake_news'.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# FIX: label said "Real_or_Fake_tweet", but that column was renamed to 'Fake_news'.
print("There are {} strongly correlated values with Fake_news:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1141]:
# Screen for features with |r| > 0.5 against Downward_momentum_created.
col_corr = df_weekly.corr()['Downward_momentum_created']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 15 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
N                            0.965420
A                            0.965420
E                            0.965420
C                            0.965420
O                            0.965420
Verified_status_False        0.964810
Volume                       0.960870
Verified_status_True         0.932391
TRANGE                       0.586204
Name: Downward_momentum_created, dtype: float64
In [1142]:
# Screen for features with |r| > 0.5 against Upward_momentum_created.
col_corr = df_weekly.corr()['Upward_momentum_created']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1143]:
# Screen for features with |r| > 0.5 against Verified_status_True.
col_corr = df_weekly.corr()['Verified_status_True']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Volume                       0.955447
B5_N_Dm                      0.932391
B5_A_Dm                      0.932391
B5_E_Dm                      0.932391
B5_C_Dm                      0.932391
B5_O_Dm                      0.932391
Downward_momentum_created    0.932391
N                            0.914538
A                            0.914538
E                            0.914538
C                            0.914538
O                            0.914538
Verified_status_False        0.912547
Name: Verified_status_True, dtype: float64
In [1144]:
# Screen for features with |r| > 0.5 against Verified_status_False.
col_corr = df_weekly.corr()['Verified_status_False']
golden_features_list = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999988
A                            0.999988
E                            0.999988
C                            0.999988
O                            0.999988
B5_N_Dm                      0.964810
B5_A_Dm                      0.964810
B5_E_Dm                      0.964810
B5_C_Dm                      0.964810
B5_O_Dm                      0.964810
Downward_momentum_created    0.964810
Volume                       0.953585
Verified_status_True         0.912547
TRANGE                       0.530406
ADOSC                       -0.573108
Name: Verified_status_False, dtype: float64
In [1145]:
# Reduce the seaborn font scale for the pairplot grid below.
sns.set(font_scale=0.8)
In [1146]:
# Pairplots of each feature (five at a time) against NATR.
# NOTE(review): with ~51 columns this creates ~11 figures; consider closing
# them (plt.close) if kernel memory becomes an issue.
for i in range(0, len(df_weekly.columns), 5):
    sns.pairplot(data=df_weekly,
                x_vars=df_weekly.columns[i:i+5],
                y_vars=['NATR'])
In [1147]:
# Zero-fill any remaining NaNs in the weekly frame.
# Reassignment instead of inplace=True keeps the cell idempotent on re-run.
df_weekly = df_weekly.fillna(0)
In [1148]:
# NOTE(review): redundant if the fillna(0) cell above already ran (no NaNs
# can remain) — kept as a safety net. Reassignment avoids inplace mutation.
df_weekly = df_weekly.dropna()
In [1149]:
# Heatmap showing only strong correlations; values in (-0.4, 0.5) are masked out.
# NOTE(review): thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm intended.
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly Volatility Distribution

In [1150]:
# Histogram of weekly NATR with a fitted normal curve for comparison.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it works in
    # this pinned Colab environment, but consider histplot/displot going forward.
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();